library(tidyverse) ; library(reshape2) ; library(glue) ; library(plotly) ; library(plotlyutils)
library(RColorBrewer) ; library(viridis) ; require(gridExtra) ; library(GGally) ; library(ggpubr)
library(Rtsne)
library(expss)
library(ClusterR)
library(DESeq2) ; library(biomaRt)
library(knitr)

Load preprocessed dataset (preprocessing code in 01_data_preprocessing.Rmd)

# Gandal dataset
load('./../Data/preprocessed_data.RData')
datExpr = datExpr %>% data.frame
DE_info = DE_info %>% data.frame

# GO Neuronal annotations: regex 'neuron' in GO functional annotations and label the genes that make a match as neuronal
GO_annotations = read.csv('./../Data/genes_GO_annotations.csv')
GO_neuronal = GO_annotations %>% filter(grepl('neuron', go_term)) %>% 
              mutate('ID'=as.character(ensembl_gene_id)) %>% 
              dplyr::select(-ensembl_gene_id) %>% distinct(ID) %>%
              mutate('Neuronal'=1)

# Update DE_info with Neuronal information
DE_info = DE_info %>% mutate('ID'=rownames(.)) %>% left_join(GO_neuronal, by='ID') %>%
  mutate(Neuronal=ifelse(is.na(Neuronal), 0, Neuronal)) %>%
  mutate(significant=padj<0.05 & !is.na(padj))

rm(GO_annotations)


Mean Level of Expression


All samples together


  • There seem to be two different behaviours in mean expression by gene, which doesn’t happen when grouping them by sample
plot_data = data.frame('ID'=rownames(datExpr), 'Mean'=rowMeans(datExpr))
p1 = plot_data %>% ggplot(aes(Mean)) + geom_density(color='#0099cc', fill='#0099cc', alpha=0.3) +
     xlab('Mean Expression') + ylab('Density') + ggtitle('Mean Expression distribution by Gene') +
     scale_x_log10() + theme_minimal()

plot_data = data.frame('ID'=colnames(datExpr), 'Mean'=colMeans(datExpr))
p2 = plot_data %>% ggplot(aes(Mean)) + geom_density(color='#0099cc', fill='#0099cc', alpha=0.3) +
     xlab('Mean Expression') + ylab('Density') +
     theme_minimal() + ggtitle('Mean expression distribution by Sample')

grid.arrange(p1, p2, nrow=1)

rm(p1, p2, plot_data)

Grouping samples by Phenotype


The differences in level of expression between Phenotypes are not statistically significant

plot_data = data.frame('ID'=colnames(datExpr), 'Mean'=colMeans(datExpr)) %>% 
            mutate('ID'=substring(ID,2)) %>% left_join(datMeta, by=c('ID'='Dissected_Sample_ID'))

p1 = plot_data %>% ggplot(aes(RNAExtractionBatch, Mean, fill = RNAExtractionBatch)) + 
     geom_boxplot(outlier.colour='gray', outlier.shape='o', outlier.size=3) + 
     stat_compare_means(label = 'p.signif', method = 't.test', method.args = list(var.equal = FALSE)) +
     xlab('Batch') + ylab('Mean Expression') + theme_minimal() + theme(legend.position = 'none')

p2 = plot_data %>% ggplot(aes(Sex, Mean, fill = Sex)) + 
     geom_boxplot(outlier.colour='gray', outlier.shape='o', outlier.size=3) + 
     stat_compare_means(label = 'p.signif', method = 't.test', method.args = list(var.equal = FALSE)) +
     xlab('Gender') + ylab('Mean Expression') + theme_minimal() + theme(legend.position = 'none')

p3 = plot_data %>% ggplot(aes(Brain_lobe, Mean, fill = Brain_lobe)) + 
     geom_boxplot(outlier.colour='gray', outlier.shape='o', outlier.size=3) + 
     stat_compare_means(label = 'p.signif') + xlab('Brain Lobe') + ylab('Mean Expression') + 
     theme_minimal() + theme(legend.position = 'none')

grid.arrange(p1,p2,p3, nrow = 1)

rm(p1,p2,p3)

Grouping genes by Neuronal tag and samples by Diagnosis


  • The two groups of genes seem to be partially characterised by genes with Neuronal function, but it doesn’t play such an important role as when we were considering also the non protein-coding genes

  • The heavy right tale from the sample distribution corresponds to a group of Autism samples. In general, the autism group has a bigger mean and is more spread out than the control group

  • Both differences in mean expression between Neuronal and non-neuronal genes and ASD and CTL samples are significant

plot_data = data.frame('ID'=rownames(datExpr), 'Mean'=rowMeans(datExpr)) %>% 
            left_join(GO_neuronal, by='ID') %>% mutate('Neuronal'=ifelse(is.na(Neuronal),F,T))

p1 = plot_data %>% ggplot(aes(Mean, color=Neuronal, fill=Neuronal)) + geom_density(alpha=0.3) +
                   scale_x_log10() + theme_minimal() + theme(legend.position='bottom') + 
                   ggtitle('Mean expression by gene')

p3 = plot_data %>% ggplot(aes(Neuronal, Mean, fill = Neuronal)) + 
                   geom_boxplot(outlier.colour='gray', outlier.shape='o', outlier.size=3) + 
                   stat_compare_means(label = 'p.signif', method = 't.test', 
                                      method.args = list(var.equal = FALSE)) + theme_minimal() + 
                   ylab('Mean Expression') + theme(legend.position = 'none')

plot_data = data.frame('ID'=colnames(datExpr), 'Mean'=colMeans(datExpr)) %>% 
            mutate('ID'=substring(ID,2)) %>% left_join(datMeta, by=c('ID'='Dissected_Sample_ID'))

p2 = plot_data %>% ggplot(aes(Mean, color=Diagnosis, fill=Diagnosis)) + geom_density(alpha=0.3) +
                   theme_minimal() + theme(legend.position='bottom') + 
                   ggtitle('Mean expression by Sample')

p4 = plot_data %>% ggplot(aes(Diagnosis, Mean, fill = Diagnosis)) + 
                   geom_boxplot(outlier.colour='gray', outlier.shape='o', outlier.size=3) + 
                   stat_compare_means(label = 'p.signif', method = 't.test', 
                                      method.args = list(var.equal = FALSE)) + theme_minimal() +
                   ylab('Mean Expression') + theme(legend.position = 'none')


grid.arrange(p1, p2, p3, p4, nrow=2)

rm(GO_annotations, plot_data, p1, p2, p3, p4)


Grouping genes and samples by Diagnosis

In general there doesn’t seem to be a lot of variance in mean expression between autism and control samples by gene

plot_data = data.frame('ID'=rownames(datExpr),
                       'ASD'=rowMeans(datExpr[,datMeta$Diagnosis=='ASD']),
                       'CTL'=rowMeans(datExpr[,datMeta$Diagnosis!='ASD'])) %>%
                       mutate(diff=ASD-CTL, abs_diff = abs(ASD-CTL)) %>%
                       mutate(std_diff = (diff-mean(diff))/sd(diff), distance = abs((diff-mean(diff))/sd(diff)))

plot_data %>% ggplot(aes(ASD, CTL, color = distance)) + geom_point(alpha = plot_data$abs_diff) + geom_abline(color = 'gray') +
              scale_color_viridis(direction = -1) + ggtitle('Mean expression ASD vs CTL') + theme_minimal() + coord_fixed()

summary(plot_data$std_diff)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -5.67935 -0.59173 -0.04612  0.00000  0.51906  7.50525
#cat(paste0('Outlier genes: ', paste(plot_data$ID[abs(plot_data$std_diff)>3], collapse = ', ')))

There are 200 genes with a difference between Diagnoses larger than 3 SD to the distance distribution of all genes


  • There doesn’t seem to be a noticeable difference between mean expression by gene between Diagnosis groups

  • Samples with autism tend to have a wider dispersion of mean expression with higher values than the control group (as we had already seen above)

plot_data = rbind(data.frame('Mean'=rowMeans(datExpr[,datMeta$Diagnosis=='ASD']), 'Diagnosis'='ASD'),
                  data.frame('Mean'=rowMeans(datExpr[,datMeta$Diagnosis!='ASD']), 'Diagnosis'='CTL')) %>%
            mutate('Diagnosis'=factor(Diagnosis, levels=c('CTL','ASD')))
p1 = ggplotly(plot_data %>% ggplot(aes(Mean, color=Diagnosis, fill=Diagnosis)) + 
              geom_density(alpha=0.3) + scale_x_log10() + theme_minimal())

plot_data = rbind(data.frame('Mean'=colMeans(datExpr[,datMeta$Diagnosis=='ASD']), 'Diagnosis'='ASD'),
                  data.frame('Mean'=colMeans(datExpr[,datMeta$Diagnosis!='ASD']), 'Diagnosis'='CTL')) %>%
            mutate('Diagnosis'=factor(Diagnosis, levels=c('CTL','ASD')))
p2 = ggplotly(plot_data %>% ggplot(aes(Mean, color=Diagnosis, fill=Diagnosis)) + 
              geom_density(alpha=0.3) + theme_minimal() +
              ggtitle('Mean expression by Gene (left) and by Sample (right) grouped by Diagnosis'))

subplot(p1, p2, nrows=1)
rm(p1, p2, plot_data)




Visualisations


Samples


PCA


The first principal component seems to separate almost perfectly the two diagnosis

pca = datExpr %>% t %>% prcomp

plot_data = data.frame('ID'=colnames(datExpr), 'PC1' = pca$x[,1], 'PC2' = pca$x[,2]) %>% 
            mutate('ID'=substring(ID,2)) %>% left_join(datMeta, by=c('ID'='Dissected_Sample_ID')) %>% 
            dplyr::select('ID','PC1','PC2','Diagnosis') %>% 
            mutate('Diagnosis'=factor(Diagnosis, levels=c('CTL','ASD')))

plot_data %>% ggplot(aes(PC1, PC2, color=Diagnosis)) + geom_point(alpha = 0.8) +
              xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],1),'%)')) +
              ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],1),'%)')) +
              theme_minimal() + ggtitle('PCA of Samples')

rm(pca, plot_data)


MDS


Looks exactly the same as the PCA visualisation, just inverting both axis

d = datExpr %>% t %>% dist
fit = cmdscale(d, k=2)

plot_data = data.frame('ID'=colnames(datExpr), 'C1'=fit[,1], 'C2'=fit[,2]) %>%
            mutate('ID'=substring(ID,2)) %>% left_join(datMeta, by=c('ID'='Dissected_Sample_ID')) %>% 
            dplyr::select('C1','C2','Diagnosis') %>%
            mutate('Diagnosis'=factor(Diagnosis, levels=c('CTL','ASD')))

plot_data %>% ggplot(aes(C1, C2, color=Diagnosis)) + geom_point(alpha = 0.8) + theme_minimal() + ggtitle('MDS')

rm(d, fit, plot_data)


t-SNE


Higher perplexities seem to capture the difference between Diagnosis better, although at the end they seem to be capturing another pattern as well, since the samples seem to be grouped in pairs

perplexities = c(2,5,10,25)
ps = list()

for(i in 1:length(perplexities)){
  set.seed(123)
  tsne = datExpr %>% t %>% Rtsne(perplexity=perplexities[i])
  plot_data = data.frame('ID'=colnames(datExpr), 'C1'=tsne$Y[,1], 'C2'=tsne$Y[,2]) %>%
              mutate('ID'=substring(ID,2)) %>% left_join(datMeta, by=c('ID'='Dissected_Sample_ID')) %>%
              dplyr::select('C1','C2','Diagnosis','Subject_ID') %>%
              mutate('Diagnosis'=factor(Diagnosis, levels=c('CTL','ASD')))
  ps[[i]] = plot_data %>% ggplot(aes(C1, C2, color=Diagnosis)) + geom_point() + theme_minimal() +
            ggtitle(paste0('Perplexity=',perplexities[i])) + theme(legend.position='none')
}

grid.arrange(grobs=ps, nrow=2)

rm(ps, perplexities, tsne, i)


The second pattern t-SNE seems to be capturing is the subject the samples belong to!

ggplotly(plot_data %>% ggplot(aes(C1, C2, color=Subject_ID)) + geom_point(aes(id=Subject_ID)) + theme_minimal() + 
         theme(legend.position='none') + ggtitle('t-SNE Perplexity=20 coloured by Subject ID'))
rm(plot_data)

Genes


PCA


  • The First Principal Component explains over 99% of the total variance

  • There’s a really strong correlation between the mean expression of a gene and the 1st principal component

pca = datExpr %>% prcomp

plot_data = data.frame( 'PC1' = pca$x[,1], 'PC2' = pca$x[,2], 'MeanExpr'=rowMeans(datExpr))

plot_data %>% ggplot(aes(PC1, PC2, color=MeanExpr)) + geom_point(alpha=0.3) + theme_minimal() + 
              scale_color_viridis() + ggtitle('PCA') +
              xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],1),'%)')) +
              ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],1),'%)'))

rm(pca, plot_data)


t-SNE


Higher perplexities capture a cleaner visualisation of the data ordered by mean expression, in a similar (although not as linear) way to PCA

perplexities = c(1,2,5,10,50,100)
ps = list()

for(i in 1:length(perplexities)){
  tsne = read.csv(paste0('./../Visualisations/tsne_perplexity_',perplexities[i],'.csv'))
  plot_data = data.frame('C1'=tsne[,1], 'C2'=tsne[,2], 'MeanExpr'=rowMeans(datExpr))
  ps[[i]] = plot_data %>% ggplot(aes(C1, C2, color=MeanExpr)) + geom_point(alpha=0.5) + theme_minimal() +
            scale_color_viridis() + ggtitle(paste0('Perplexity = ',perplexities[i])) + theme(legend.position='none')
}

grid.arrange(grobs=ps, nrow=2)

rm(perplexities, ps, tsne, i)




Differential Expression Analysis


cro(DE_info$padj<0.05)
 #Total 
 DE_info$padj < 0.05 
   FALSE  11641
   TRUE  4491
   #Total cases  16132
p = DE_info %>% ggplot(aes(log2FoldChange, padj, color=significant)) + geom_point(alpha=0.2) + 
    scale_y_sqrt() + xlab('log2 Fold Change') + ylab('Adjusted p-value') + theme_minimal()
ggExtra::ggMarginal(p, type = 'density', color='gray', fill='gray', size=10)

rm(p)
plot_data = data.frame('ID'=rownames(datExpr), 'meanExpr'=rowMeans(datExpr)) %>% left_join(DE_info, by='ID') %>%
            mutate('statistically_Significant'=padj<0.05 & !is.na(padj))

plot_data %>% ggplot(aes(meanExpr, abs(log2FoldChange), color=statistically_Significant)) + 
              geom_point(alpha=0.1) + geom_smooth(method='lm') + theme_minimal() + scale_y_sqrt() + 
              theme(legend.position = 'bottom') + xlab('Mean Expression') + ylab('LFC Magnitude') + 
              ggtitle('Log fold change by level of expression')

datExpr_DE = datExpr[DE_info$significant,]

pca = datExpr_DE %>% prcomp

plot_data = cbind(data.frame('PC1'=pca$x[,1], 'PC2'=pca$x[,2]), DE_info[DE_info$significant==TRUE,])

pos_zero = -min(plot_data$log2FoldChange)/(max(plot_data$log2FoldChange)-min(plot_data$log2FoldChange))
p = plot_data %>% ggplot(aes(PC1, PC2, color=log2FoldChange)) + geom_point(alpha=0.5) +
                  scale_color_gradientn(colours=c('#F8766D','#faa49e','white','#00BFC4','#009499'), 
                                        values=c(0, pos_zero-0.05, pos_zero, pos_zero+0.05, 1)) +
                  theme_minimal() + ggtitle('
PCA of differentially expressed genes') + # This is on purpose, PDF doesn't save well without this white space (?)
                  xlab(paste0('PC1 (',round(100*summary(pca)$importance[2,1],1),'%)')) +
                  ylab(paste0('PC2 (',round(100*summary(pca)$importance[2,2],1),'%)')) + theme(legend.position = 'bottom')
ggExtra::ggMarginal(p, type='density', color='gray', fill='gray', size=10)

rm(pos_zero, p)

Separating the genes into these two groups: Salmon: under-expressed, aqua: over-expressed

plot_data = plot_data %>% mutate('Group'=ifelse(log2FoldChange>0,'overexpressed','underexpressed')) %>%
            mutate('Group' = factor(Group, levels=c('underexpressed','overexpressed')))

List of top DE genes

# Get genes names
getinfo = c('ensembl_gene_id','external_gene_id')
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL', dataset='hsapiens_gene_ensembl', host='feb2014.archive.ensembl.org')
gene_names = getBM(attributes=getinfo, filters=c('ensembl_gene_id'), values=plot_data$ID, mart=mart) %>% 
             rename(external_gene_id = 'gene_name', ensembl_gene_id = 'ID')
## Batch submitting query [======>------------------------] 22% eta: 1s
## Batch submitting query [=========>---------------------] 33% eta: 1s
## Batch submitting query [=============>-----------------] 44% eta: 1s
## Batch submitting query [================>--------------] 56% eta: 1s
## Batch submitting query [====================>----------] 67% eta: 1s Batch
## submitting query [=======================>-------] 78% eta: 0s Batch submitting
## query [===========================>---] 89% eta: 0s Batch submitting query
## [===============================] 100% eta: 0s
top_genes = plot_data %>% left_join(gene_names, by='ID') %>% arrange(-abs(log2FoldChange)) %>% 
            top_n(50, wt=log2FoldChange)

kable(top_genes %>% dplyr::select(ID, gene_name, log2FoldChange, padj, Neuronal, Group))
ID gene_name log2FoldChange padj Neuronal Group
ENSG00000013588 GPRC5A 1.9569959 0.0000000 0 overexpressed
ENSG00000198203 SULT1C2 1.9085997 0.0000291 0 overexpressed
ENSG00000111057 KRT18 1.8884113 0.0000005 0 overexpressed
ENSG00000027644 INSRR 1.8623331 0.0000525 0 overexpressed
ENSG00000106366 SERPINE1 1.6852362 0.0000002 0 overexpressed
ENSG00000164707 SLC13A4 1.6223298 0.0003213 0 overexpressed
ENSG00000101162 TUBB1 1.5356793 0.0005205 0 overexpressed
ENSG00000173110 HSPA6 1.5145204 0.0000000 0 overexpressed
ENSG00000126562 WNK4 1.5074204 0.0000033 0 overexpressed
ENSG00000064886 CHI3L2 1.4813180 0.0000852 0 overexpressed
ENSG00000124107 SLPI 1.4745353 0.0000984 0 overexpressed
ENSG00000112499 SLC22A2 1.4486810 0.0012480 0 overexpressed
ENSG00000176692 FOXC2 1.4446030 0.0004002 0 overexpressed
ENSG00000108821 COL1A1 1.4443531 0.0000037 0 overexpressed
ENSG00000158578 ALAS2 1.4424246 0.0017527 0 overexpressed
ENSG00000105641 SLC5A5 1.4160213 0.0009059 0 overexpressed
ENSG00000186564 FOXD2 1.3953829 0.0003680 0 overexpressed
ENSG00000144488 ESPNL 1.3876495 0.0000896 0 overexpressed
ENSG00000125735 TNFSF14 1.3570479 0.0020342 0 overexpressed
ENSG00000128342 LIF 1.3188127 0.0085773 1 overexpressed
ENSG00000112175 BMP5 1.2979929 0.0020426 0 overexpressed
ENSG00000170577 SIX2 1.2897652 0.0143515 0 overexpressed
ENSG00000145113 MUC4 1.2813362 0.0172754 0 overexpressed
ENSG00000106809 OGN 1.2792805 0.0005573 0 overexpressed
ENSG00000099937 SERPIND1 1.2756134 0.0022613 0 overexpressed
ENSG00000144644 GADL1 1.2719763 0.0080761 0 overexpressed
ENSG00000168229 PTGDR 1.2696143 0.0005787 0 overexpressed
ENSG00000168542 COL3A1 1.2484210 0.0000001 1 overexpressed
ENSG00000096696 DSP 1.2180727 0.0000049 0 overexpressed
ENSG00000171345 KRT19 1.2065835 0.0000544 0 overexpressed
ENSG00000149452 SLC22A8 1.2063915 0.0180433 0 overexpressed
ENSG00000086570 FAT2 1.2049936 0.0020150 0 overexpressed
ENSG00000136542 GALNT5 1.2022961 0.0013774 0 overexpressed
ENSG00000126778 SIX1 1.1812689 0.0000560 1 overexpressed
ENSG00000269113 TRABD2B 1.1662530 0.0008852 0 overexpressed
ENSG00000184357 HIST1H1B 1.1566548 0.0000860 0 overexpressed
ENSG00000174576 NPAS4 1.1510304 0.0051187 0 overexpressed
ENSG00000153404 PLEKHG4B 1.1383430 0.0000258 0 overexpressed
ENSG00000105825 TFPI2 1.1292093 0.0017849 0 overexpressed
ENSG00000175592 FOSL1 1.1175794 0.0057396 1 overexpressed
ENSG00000159212 CLIC6 1.1170706 0.0002658 0 overexpressed
ENSG00000175505 CLCF1 1.0821605 0.0115866 1 overexpressed
ENSG00000171819 ANGPTL7 1.0771146 0.0073214 0 overexpressed
ENSG00000173714 WFIKKN2 1.0598917 0.0021939 0 overexpressed
ENSG00000167244 IGF2 1.0525865 0.0002490 0 overexpressed
ENSG00000139219 COL2A1 1.0490629 0.0153584 0 overexpressed
ENSG00000182611 HIST1H2AJ 1.0478960 0.0067387 0 overexpressed
ENSG00000073792 IGF2BP2 1.0467431 0.0000005 0 overexpressed
ENSG00000159167 STC1 1.0004532 0.0000165 0 overexpressed
ENSG00000173068 BNC2 0.9872393 0.0000000 0 overexpressed
rm(top_genes)

Plotting the mean expression by group they seem to have two and three underlying distributions, respectively, so a Gaussian Mixture Model was fitted to each one to separate them into two/three Gaussians and then the points corresponding to each one were plotted in the original PCA plot.

gg_colour_hue = function(n) {
  hues = seq(15, 375, length = n + 1)
  hcl(h = hues, l = 65, c = 100)[1:n]
}

tot_n_clusters = 5

plot_data = plot_data %>% mutate('MeanExpr'=rowMeans(datExpr_DE), 'SDExpr'=apply(datExpr_DE,1,sd))

GMM_G1 = plot_data %>% filter(Group=='overexpressed') %>% dplyr::select(MeanExpr) %>% GMM(3)
GMM_G2 = plot_data %>% filter(Group=='underexpressed') %>% dplyr::select(MeanExpr) %>% GMM(2)

memberships_G1 = data.frame('ID'=plot_data$ID[plot_data$Group=='overexpressed'],
                            'Membership'=GMM_G1$Log_likelihood %>% apply(1, function(x) glue('over_', which.max(x))))
memberships_G2 = data.frame('ID'=plot_data$ID[plot_data$Group=='underexpressed'],
                            'Membership'=GMM_G2$Log_likelihood %>% apply(1, function(x) glue('under_', which.max(x))))

plot_data = rbind(memberships_G1, memberships_G2) %>% left_join(plot_data, by='ID')

p1 = plot_data %>% ggplot(aes(x=MeanExpr, color=Group, fill=Group)) + geom_density(alpha=0.4) + 
     theme_minimal() + theme(legend.position='bottom')

p2 = plot_data %>% ggplot(aes(x=Group, y=MeanExpr, fill=Group)) + ylab('Mean Expression') + xlab('') +
     geom_boxplot(outlier.colour='gray', outlier.shape='o', outlier.size=3) + 
     stat_compare_means(label = 'p.signif', method = 't.test', method.args = list(var.equal = FALSE)) +
     theme_minimal() + theme(legend.position='none')

p3 = plot_data %>% ggplot(aes(x=MeanExpr)) +
     stat_function(fun=dnorm, n=100, colour=gg_colour_hue(tot_n_clusters)[1],
                   args=list(mean=GMM_G1$centroids[1], sd=GMM_G1$covariance_matrices[1])) +
     stat_function(fun=dnorm, n=100, colour=gg_colour_hue(tot_n_clusters)[2],
                   args=list(mean=GMM_G1$centroids[2], sd=GMM_G1$covariance_matrices[2])) +
     stat_function(fun=dnorm, n=100, colour=gg_colour_hue(tot_n_clusters)[3],
                   args=list(mean=GMM_G1$centroids[3], sd=GMM_G1$covariance_matrices[3])) +
     stat_function(fun=dnorm, n=100, colour=gg_colour_hue(tot_n_clusters)[4],
                   args=list(mean=GMM_G2$centroids[1], sd=GMM_G2$covariance_matrices[1])) +
     stat_function(fun=dnorm, n=100, colour=gg_colour_hue(tot_n_clusters)[5],
                  args=list(mean=GMM_G2$centroids[2], sd=GMM_G2$covariance_matrices[2])) +
     theme_minimal()

p4 = plot_data %>% ggplot(aes(PC1, PC2, color=Membership)) + geom_point(alpha=0.4) + theme_minimal() + 
     theme(legend.position='bottom')

grid.arrange(p1, p2, p3, p4, nrow=2)

rm(gg_color_hue, n_clusters, GMM_G1, GMM_G2, memberships_G1, memberships_G2, p1, p2, p3, tot_n_clusters)

For previous preprocessing pipelines, the pattern found above was also present in the standard deviation, but there doesn’t seem to be any strong patterns now. This could be because the variance was almost homogenised with the vst normalisation algorithm.

plot_data %>% ggplot(aes(x=SDExpr, color=Group, fill=Group)) + geom_density(alpha=0.4) + theme_minimal()

rm(plot_data)



Effects of modifying the log fold change threshold

fc_list = seq(1, 1.25, 0.01)

n_genes = nrow(datExpr)

# Calculate PCAs
datExpr_pca_samps = datExpr %>% data.frame %>% t %>% prcomp(scale.=TRUE)
datExpr_pca_genes = datExpr %>% data.frame %>% prcomp(scale.=TRUE)

# Initialise DF to save PCA outputs
pcas_samps = datExpr_pca_samps$x %>% data.frame %>% dplyr::select(PC1:PC2) %>% 
             mutate('ID'=colnames(datExpr), 'fc'=-1, PC1=scale(PC1), PC2=scale(PC2))
pcas_genes = datExpr_pca_genes$x %>% data.frame %>% dplyr::select(PC1:PC2) %>% 
             mutate('ID'=rownames(datExpr), 'fc'=-1, PC1=scale(PC1), PC2=scale(PC2))

pca_samps_old = pcas_samps
pca_genes_old = pcas_genes

for(fc in fc_list){
  
  # Recalculate DE_info with the new threshold (p-values change) an filter DE genes
  DE_genes = results(dds, lfcThreshold=log2(fc), altHypothesis='greaterAbs') %>% data.frame %>%
             mutate('ID'=rownames(.)) %>% filter(padj<0.05)
  
  datExpr_DE = datExpr %>% data.frame %>% filter(rownames(.) %in% DE_genes$ID)
  n_genes = c(n_genes, nrow(DE_genes))
  
  # Calculate PCAs
  datExpr_pca_samps = datExpr_DE %>% t %>% prcomp(scale.=TRUE)
  datExpr_pca_genes = datExpr_DE %>% prcomp(scale.=TRUE)

  # Create new DF entries
  pca_samps_new = datExpr_pca_samps$x %>% data.frame %>% dplyr::select(PC1:PC2) %>% 
                  mutate('ID'=colnames(datExpr), 'fc'=fc, PC1=scale(PC1), PC2=scale(PC2))
  pca_genes_new = datExpr_pca_genes$x %>% data.frame %>% dplyr::select(PC1:PC2) %>% 
                  mutate('ID'=DE_genes$ID, 'fc'=fc, PC1=scale(PC1), PC2=scale(PC2))  
  
  # Change PC sign if necessary
  if(cor(pca_samps_new$PC1, pca_samps_old$PC1)<0) pca_samps_new$PC1 = -pca_samps_new$PC1
  if(cor(pca_samps_new$PC2, pca_samps_old$PC2)<0) pca_samps_new$PC2 = -pca_samps_new$PC2
  if(cor(pca_genes_new[pca_genes_new$ID %in% pca_genes_old$ID,]$PC1,
         pca_genes_old[pca_genes_old$ID %in% pca_genes_new$ID,]$PC1)<0){
    pca_genes_new$PC1 = -pca_genes_new$PC1
  }
  if(cor(pca_genes_new[pca_genes_new$ID %in% pca_genes_old$ID,]$PC2, 
         pca_genes_old[pca_genes_old$ID %in% pca_genes_new$ID,]$PC2 )<0){
    pca_genes_new$PC2 = -pca_genes_new$PC2
  }
  
  pca_samps_old = pca_samps_new
  pca_genes_old = pca_genes_new
  
  # Update DFs
  pcas_samps = rbind(pcas_samps, pca_samps_new)
  pcas_genes = rbind(pcas_genes, pca_genes_new)
  
}

# Add Diagnosis/SFARI score information
pcas_samps = pcas_samps %>% mutate('ID'=substring(ID,2)) %>% 
             left_join(datMeta, by=c('ID'='Dissected_Sample_ID')) %>%
             dplyr::select(ID, PC1, PC2, fc, Diagnosis, Brain_lobe)
# pcas_genes = pcas_genes %>% left_join(SFARI_genes, by='ID') %>% 
#              mutate('score'=as.factor(`gene-score`)) %>%
#              dplyr::select(ID, PC1, PC2, lfc, score)

# Plot change of number of genes
ggplotly(data.frame('lfc'=log2(fc_list), 'n_genes'=n_genes[-1]) %>% ggplot(aes(x=lfc, y=n_genes)) + 
         geom_point() + geom_line() + theme_minimal() + xlab('Log Fold Change Magnitude') + ylab('DE Genes') +
         ggtitle('Number of Differentially Expressed genes when modifying filtering threshold'))
rm(fc_list, n_genes, fc, pca_samps_new, pca_genes_new, pca_samps_old, pca_genes_old, 
   datExpr_pca_samps, datExpr_pca_genes)


Samples

Note: PC values get smaller as LFC increases, so on each iteration the values were scaled so it would be easier to compare between frames

Coloured by Diagnosis:

  • LFC = -1 represents the whole set of genes, without any filtering by differential expression

  • Log Fold Changes between 0.06 and 0.18 seem to separate the samples by Diagnosis best

ggplotly(pcas_samps %>% mutate(abs_lfc=ifelse(fc==-1,-1,round(log2(fc),2))) %>% 
         ggplot(aes(PC1, PC2, color=Diagnosis)) + geom_point(aes(frame=abs_lfc, ids=ID), alpha=0.7) + 
         theme_minimal() + ggtitle('Samples PCA plot modifying filtering threshold'))


Coloured by brain region:

There doesn’t seem to be any recognisable pattern

ggplotly(pcas_samps %>% mutate(abs_lfc=ifelse(fc==-1,-1,round(log2(fc),2))) %>% 
         ggplot(aes(PC1, PC2, color=Brain_lobe)) + geom_point(aes(frame=abs_lfc, ids=ID)) + 
         theme_minimal() + ggtitle('Samples PCA plot modifying filtering threshold'))


Genes

if(!'fcSign' %in% colnames(pcas_genes)){
  pcas_genes = pcas_genes %>% left_join(DE_info, by='ID') %>% 
               mutate(LFCSign = ifelse(log2FoldChange>0,'Positive','Negative')) 
}

ggplotly(pcas_genes %>% mutate(abs_lfc=ifelse(fc==-1,-1,round(log2(fc),2))) %>% 
         ggplot(aes(PC1, PC2, color=LFCSign)) + geom_point(aes(frame=abs_lfc, ids=ID), alpha=0.2) + 
         theme_minimal() + ggtitle('Genes PCA plot modifying LFC Magnitude filtering threshold'))




Session info

sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.4 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
## 
## locale:
##  [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_GB.UTF-8        LC_COLLATE=en_GB.UTF-8    
##  [5] LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
##  [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] knitr_1.28                  biomaRt_2.40.5             
##  [3] DESeq2_1.24.0               SummarizedExperiment_1.14.1
##  [5] DelayedArray_0.10.0         BiocParallel_1.18.1        
##  [7] matrixStats_0.56.0          Biobase_2.44.0             
##  [9] GenomicRanges_1.36.1        GenomeInfoDb_1.20.0        
## [11] IRanges_2.18.3              S4Vectors_0.22.1           
## [13] BiocGenerics_0.30.0         ClusterR_1.2.1             
## [15] gtools_3.8.2                expss_0.10.2               
## [17] Rtsne_0.15                  ggpubr_0.2.5               
## [19] magrittr_1.5                GGally_1.5.0               
## [21] gridExtra_2.3               viridis_0.5.1              
## [23] viridisLite_0.3.0           RColorBrewer_1.1-2         
## [25] plotlyutils_0.0.0.9000      plotly_4.9.2               
## [27] glue_1.4.1                  reshape2_1.4.4             
## [29] forcats_0.5.0               stringr_1.4.0              
## [31] dplyr_1.0.0                 purrr_0.3.4                
## [33] readr_1.3.1                 tidyr_1.1.0                
## [35] tibble_3.0.1                ggplot2_3.3.2              
## [37] tidyverse_1.3.0            
## 
## loaded via a namespace (and not attached):
##   [1] readxl_1.3.1           backports_1.1.8        Hmisc_4.4-0           
##   [4] plyr_1.8.6             lazyeval_0.2.2         splines_3.6.3         
##   [7] gmp_0.5-13.6           crosstalk_1.1.0.1      digest_0.6.25         
##  [10] htmltools_0.4.0        fansi_0.4.1            checkmate_2.0.0       
##  [13] memoise_1.1.0          cluster_2.1.0          annotate_1.62.0       
##  [16] modelr_0.1.6           prettyunits_1.1.1      jpeg_0.1-8.1          
##  [19] colorspace_1.4-1       blob_1.2.1             rvest_0.3.5           
##  [22] haven_2.2.0            xfun_0.12              crayon_1.3.4          
##  [25] RCurl_1.98-1.2         jsonlite_1.7.0         genefilter_1.66.0     
##  [28] survival_3.1-12        gtable_0.3.0           zlibbioc_1.30.0       
##  [31] XVector_0.24.0         scales_1.1.1           DBI_1.1.0             
##  [34] miniUI_0.1.1.1         Rcpp_1.0.4.6           xtable_1.8-4          
##  [37] progress_1.2.2         htmlTable_1.13.3       foreign_0.8-76        
##  [40] bit_1.1-15.2           Formula_1.2-3          htmlwidgets_1.5.1     
##  [43] httr_1.4.1             acepack_1.4.1          ellipsis_0.3.1        
##  [46] pkgconfig_2.0.3        reshape_0.8.8          XML_3.99-0.3          
##  [49] farver_2.0.3           nnet_7.3-14            dbplyr_1.4.2          
##  [52] locfit_1.5-9.4         later_1.0.0            tidyselect_1.1.0      
##  [55] labeling_0.3           rlang_0.4.6            AnnotationDbi_1.46.1  
##  [58] munsell_0.5.0          cellranger_1.1.0       tools_3.6.3           
##  [61] cli_2.0.2              generics_0.0.2         RSQLite_2.2.0         
##  [64] broom_0.5.5            fastmap_1.0.1          evaluate_0.14         
##  [67] yaml_2.2.1             bit64_0.9-7            fs_1.4.0              
##  [70] nlme_3.1-147           mime_0.9               ggExtra_0.9           
##  [73] xml2_1.2.5             compiler_3.6.3         rstudioapi_0.11       
##  [76] curl_4.3               png_0.1-7              ggsignif_0.6.0        
##  [79] reprex_0.3.0           geneplotter_1.62.0     stringi_1.4.6         
##  [82] highr_0.8              lattice_0.20-41        Matrix_1.2-18         
##  [85] vctrs_0.3.1            pillar_1.4.4           lifecycle_0.2.0       
##  [88] data.table_1.12.8      bitops_1.0-6           httpuv_1.5.2          
##  [91] R6_2.4.1               latticeExtra_0.6-29    promises_1.1.0        
##  [94] assertthat_0.2.1       withr_2.2.0            GenomeInfoDbData_1.2.1
##  [97] mgcv_1.8-31            hms_0.5.3              grid_3.6.3            
## [100] rpart_4.1-15           rmarkdown_2.1          shiny_1.4.0.2         
## [103] lubridate_1.7.4        base64enc_0.1-3